> library(knitr)
> library(extrafont)
> library(stringr)
> library(ggplot2)
> library(dplyr)
> library(gplots)
> 
> # remove scientific notation unless the numbers get really big
> options(scipen = 5)
> 
> experiment = c(rep("P10_control", 6), rep("P10_rescue", 6), rep("P13_control", 
+     6), rep("P13_rescue", 6))
> experiment = rep(experiment, 16)
> age = unname(sapply(experiment, function(x) {
+     strsplit(x, "_")[[1]][1]
+ }))
> status = unname(sapply(experiment, function(x) {
+     strsplit(x, "_")[[1]][2]
+ }))
> metadata = data.frame(experiment = experiment, age = age, status = status)
> 
> umi_count_file = "../data/retinaAligned.out.cleaned.counts_umi.gz"
> 
> 
> umi_counts = function(umi_count_file) {
+     plate = tbl_df(read.table(gzfile(umi_count_file), sep = "\t"))
+     colnames(plate) = c("umi", "gene", "cell", "counts")
+     byumi = plate %>% group_by(umi) %>% summarise(sum = sum(counts))
+     byumi = subset(byumi, !grepl("N", umi))
+     byumi$umi = as.character(byumi$umi)
+     return(byumi)
+ }

Exploratory analysis

There is not a huge skew in UMI distribution towards G-repeats that we have seen with some other libraries, the top UMI tags look pretty reasonable:

> umi = umi_counts(umi_count_file)
> umi = umi[order(-umi$sum), ]
> kable(head(umi, 25), format = "html")
umi sum
ACAAGTGAAA 14760
GGGGTAGAAT 14117
GCGTGTGCTA 13299
GGATGATTGT 12616
GGGACGATTT 12597
CGCCAACAGT 12408
CGTTACTTGC 11713
GGCAGGTCAA 11631
CGGGGAGCGT 11578
AAATAGAAAC 11346
GCGGAGAAGA 11338
AGCCCGTCGT 10981
GATGAGGGGT 10958
TGGGGGGTGG 10793
GCTCTCATGG 10762
AACGGTAGGG 10698
AAACTGTACA 10689
TGGAGCTGAG 10671
GGGGGCTTTA 10610
TATGAAAGGC 10575
GTGCACAGGG 10505
ACAGGAGGCG 10354
TTCTGTAGAG 10161
GCTGTTGTCG 10135
GGACGACGGA 10134

The UMI content is slightly skewed, with UMI with a higher GC content more likely to be sequenced:

> umi$G = str_count(umi$umi, "G")
> umi$T = str_count(umi$umi, "T")
> umi$C = str_count(umi$umi, "C")
> umi$A = str_count(umi$umi, "A")
> umi$GC = (umi$G + umi$C)/(umi$G + umi$C + umi$T + umi$A)
> ggplot(umi, aes(as.factor(GC), sum)) + geom_boxplot() + scale_y_log10() + xlab("GC content") + 
+     ylab("reads mapped") + theme_bw(base_size = 12, base_family = "Gill Sans MT") + 
+     theme(panel.grid.major = element_line(size = 0.5, color = "grey"))

> sort_samplenames = function(samplenames) {
+     sample = unlist(lapply(str_split(samplenames, "[.]"), function(x) {
+         x[1]
+     }))
+     ends = unlist(lapply(str_split(samplenames, "[.]"), function(x) {
+         x[2]
+     }))
+     plate = unlist(lapply(str_split(samplenames, "_"), function(x) {
+         x[1]
+     }))
+     well = unlist(lapply(str_split(samplenames, "_"), function(x) {
+         x[2]
+     }))
+     row = substr(well, 1, 1)
+     plate = as.numeric(substr(well, 2, 3))
+     df = data.frame(id = samplenames, row = row, plate = plate)
+     ord = df[order(df$row, df$plate), ]
+     return(as.character(ord$id))
+ }
> 
> counts_from_umi = function(umi_count_file, tag) {
+     plate = tbl_df(read.table(gzfile(umi_count_file), sep = "\t"))
+     colnames(plate) = c("umi", "gene", "cell", "counts")
+     byumi = plate %>% group_by(umi) %>% summarise(sum = sum(counts))
+     umi_to_keep = subset(byumi, !grepl("N", umi))$umi
+     umi_filter = subset(plate, plate$umi %in% umi_to_keep)
+     umi_filter$counts_umi = 1
+     bygene_sample = umi_filter %>% group_by(gene, cell) %>% summarise(sum = sum(counts_umi))
+     bygene_sample = data.frame(bygene_sample)
+     counts = reshape(bygene_sample, timevar = "cell", idvar = "gene", direction = "wide")
+     counts[is.na(counts)] = 0
+     row.names(counts) = counts$gene
+     counts$gene = NULL
+     colnames(counts) = lapply(colnames(counts), function(x) {
+         paste(tag, strsplit(x, "sum.")[[1]][2], sep = ".")
+     })
+     counts = counts[order(rownames(counts)), sort_samplenames(colnames(counts))]
+     return(counts)
+ }
> 
> umi = counts_from_umi(umi_count_file, "retina")
> colnames(umi) = unname(sapply(colnames(umi), function(x) {
+     strsplit(x, "_")[[1]][2]
+ }))
> metadata$well = colnames(umi)
> metadata$row = substr(colnames(umi), 1, 1)

There are only a small number of genes detected in each well.

> qplot(colSums(umi > 0)) + geom_histogram() + xlab("genes detected per well") + 
+     theme_bw(base_size = 12, base_family = "Gill Sans MT") + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"))

There are only a small number of counts per well, less than 1000:

> summary(colSums(umi))
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   12.0   172.5   404.5   705.1   747.0 11630.0 
> qplot(colSums(umi)) + geom_histogram() + xlab("total counts per well") + theme_bw(base_size = 12, 
+     base_family = "Gill Sans MT") + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"))

Most detected genes only have a small number of reads associated with them with a large number of genes just having one read of evidence.

> qplot(rowSums(umi)) + geom_histogram() + xlab("total reads per gene") + theme_bw(base_size = 12, 
+     base_family = "Gill Sans MT") + scale_x_log10() + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"))

The P10 and P13 cells have different distributions of counts which will confound things somewhat:

> df = data.frame(well = colnames(umi), counts = colSums(umi))
> m = merge(df, metadata, by = "well", all.x = TRUE)
> ggplot(m, aes(well, counts)) + geom_bar(stat = "identity") + facet_wrap(~experiment) + 
+     ylab("counts per well") + theme_bw(base_size = 12, base_family = "Gill Sans MT") + 
+     theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.ticks.x = element_blank(), 
+         axis.text.x = element_blank())

There isn’t really any clustering of the samples based on age or rescue status, which is not good if we are hoping to call anything DE.

> library(CHBUtils)
> mds(cor(umi), condition = experiment)

> mds(cor(umi), condition = metadata$row)

Hierarchical clustering shows this as well, the colors at the top are the different experiments. This data has each gene normalized by the maximum count for that gene.

> exp_colors = data.frame(experiment = c("P10_control", "P10_rescue", "P13_control", 
+     "P13_rescue"), colors = c("blue", "red", "yellow", "green"))
> colors = merge(metadata, exp_colors, by = "experiment", all.x = TRUE)
> 
> m = as.matrix(umi)
> heatmap(m/apply(m, 1, max), zlim = c(0, 1), col = gray.colors(100), Rowv = NA, 
+     labRow = NA, scale = "none", ColSideColors = as.character(colors$colors))

Differential expression

Based on the above we shouldn’t really be expecting there to be any differences. I tried running SCDE for all the samples, P10 vs P13 and rescue vs control and neither came out as anything. I also treated P10 and P13 as a batch effect and looked at the effect of rescue vs control and couldn’t find anything either.

> library(scde)
> 
> scde_count_prep = function(counts, groups, batch = NULL, sample_min = 5000, 
+     sample_max = Inf) {
+     keep_genes = rowSums(counts) > 0
+     keep_samples = (colSums(counts) > sample_min) & (colSums(counts) < sample_max)
+     groups = droplevels(groups[keep_samples])
+     counts = counts[keep_genes, keep_samples]
+     names(groups) = colnames(counts)
+     if (!any(unlist(lapply(batch, is.na)))) {
+         batch = droplevels(batch[keep_samples])
+         names(batch) = colnames(counts)
+     }
+     return(list(counts = counts, groups = groups, batch = batch))
+ }
> 
> 
> scde_de = function(scde) {
+     o.ifm = scde.error.models(counts = scde$counts, groups = scde$groups, n.cores = 1, 
+         threshold.segmentation = TRUE, save.crossfit.plots = FALSE, save.model.plots = FALSE, 
+         verbose = 2)
+     valid.cells = o.ifm$corr.a > 0
+     o.ifm = o.ifm[valid.cells, ]
+     groups = scde$groups[valid.cells]
+     names(groups) = row.names(o.ifm)
+     if (!any(unlist(lapply(scde$batch, is.na)))) {
+         batch = scde$batch[valid.cells]
+         names(batch) = row.names(o.ifm)
+     }
+     counts = scde$counts[, valid.cells]
+     o.prior = scde.expression.prior(models = o.ifm, counts = scde$counts, length.out = 400, 
+         show.plot = T)
+     ediff = scde.expression.difference(o.ifm, scde$counts, o.prior, groups = groups, 
+         batch = batch, n.randomizations = 100, verbose = 1)
+     if (!any(unlist(lapply(scde$batch, is.na)))) {
+         ediff = ediff$batch.adjusted
+     }
+     
+     ediff$pvalue = pnorm(-(abs(ediff$Z))) * 2
+     ediff$padj = p.adjust(ediff$pvalue)
+     return(ediff)
+ }
> s = scde_count_prep(umi, metadata$status, metadata$age, 1000, 20000)
> d = scde_de(s)
cross-fitting cells.
building individual error models.

controlling for batch effects. interaction:
         batch
groups    P10 P13
  control  28   6
  rescue   28   5
calculating batch posteriors
calculating batch differences
calculating difference posterior
summarizing differences
adjusting for batch effects
> table(d$padj < 0.05)

FALSE 
10082 
> library(reshape)
> combined = read.table("../data/bulk/annotated_combined.counts", sep = "\t", 
+     header = TRUE)
> combined$symbol = toupper(combined$symbol)
> summarydata = read.table("../data/bulk/project-summary.csv", sep = ",", header = TRUE)
> 
> load_gene_list = function(in_fn) {
+     gene_list = read.table(in_fn, header = FALSE)
+     colnames(gene_list) = "symbol"
+     gene_list$symbol = toupper(gene_list$symbol)
+     return(gene_list)
+ }
> missing_genes = function(x, known) {
+     return(x[!toupper(x$symbol) %in% toupper(known$symbol), ])
+ }
> 
> symbols_to_ensembl = function(x) {
+     require(biomaRt)
+     mart = useMart(biomart = "ensembl", dataset = "mmusculus_gene_ensembl")
+     x = getBM(mart, filters = "mgi_symbol", attributes = c("mgi_symbol", "ensembl_gene_id"), 
+         values = x)
+     return(x)
+ }
> 
> unigene_to_ensembl = function(x) {
+     require(biomaRt)
+     mart = useMart(biomart = "ensembl", dataset = "mmusculus_gene_ensembl")
+     x = getBM(mart, filters = "unigene", attributes = c("unigene", "mgi_symbol", 
+         "ensembl_gene_id"), values = x)
+     return(x)
+ }
> 
> get_go_terms = function() {
+     require(biomaRt)
+     mart = useMart(biomart = "ensembl", dataset = "mmusculus_gene_ensembl")
+     x = getBM(mart = mart, attributes = c("ensembl_gene_id", "mgi_symbol", "go_id", 
+         "name_1006"))
+     return(x)
+ }
> 
> go_terms = get_go_terms()
> apoptosis_list = go_terms[grepl("apopto", go_terms$name_1006), ]
> colnames(apoptosis_list) = c("id", "symbol")
> housekeeping_list = read.table("../metadata/housekeeping.txt")
> colnames(housekeeping_list) = c("unigene")
> housekeeping_list = unigene_to_ensembl(housekeeping_list$unigene)
> colnames(housekeeping_list) = c("unigene", "symbol", "id")
> 
> rod_gene_list = read.table("../metadata/rod_gene_list.txt", header = TRUE)
> glycolysis_list = read.table("../metadata/glycolysis.txt", header = TRUE, sep = "\t")
> calcium_list = read.table("../metadata/calcium_signaling_pathway.txt", header = TRUE, 
+     sep = "\t")
> nrf2_list = read.table("../metadata/oxstressnrf2.txt", header = TRUE, sep = "\t")
> hif1_list = read.table("../metadata/hif1a.txt", header = TRUE, sep = "\t")
> oxidative_list = read.table("../metadata/oxidative_stress.txt", sep = "\t", 
+     header = TRUE)
> 
> normalize_bulk_counts = function(counts) {
+     library(edgeR)
+     counts = combined
+     rownames(counts) = counts$id
+     counts$id = NULL
+     counts$symbol = NULL
+     y = DGEList(counts = counts)
+     y = calcNormFactors(y)
+     counts = cpm(y, normalized.lib.sizes = TRUE)
+     col_order = order(colnames(counts))
+     counts = counts[, col_order]
+     return(counts)
+ }
> counts = normalize_bulk_counts(combined)
> 
> umi_col_order = order(colnames(umi))
> umi = umi[, umi_col_order]
> 
> make_bulk_heatmap = function(counts, gene_list, cexRow = 0.6, cexCol = 0.6, 
+     title = "") {
+     gene_list = unique(gene_list[, c("id", "symbol")])
+     rownames(gene_list) = gene_list$id
+     mat = as.matrix(subset(counts, rownames(counts) %in% gene_list$id))
+     mat = as.matrix(mat)
+     mat = sqrt(mat)
+     
+     rownames(mat) = gene_list[rownames(mat), ]$symbol
+     pheatmap(mat, col = grey(seq(1, 0, -0.01)), cluster_cols = FALSE, cluster_rows = FALSE, 
+         main = title)
+ }
> 
> make_singlecell_heatmap = function(counts, gene_list, cexRow = 0.6, cexCol = 0.6, 
+     title = "") {
+     gene_list = unique(gene_list[, c("id", "symbol")])
+     rownames(gene_list) = gene_list$id
+     mat = as.matrix(subset(counts, rownames(counts) %in% gene_list$id))
+     mat = as.matrix(mat)
+     mat = sqrt(mat)
+     rownames(mat) = gene_list[rownames(mat), ]$symbol
+     pheatmap(mat, col = grey(seq(1, 0, -0.01)), cluster_cols = FALSE, show_colnames = FALSE, 
+         cluster_rows = FALSE, main = title)
+ }

Rod genes

> make_bulk_heatmap(counts, rod_gene_list, title = "Square root of bulk counts for rod genes")

> make_singlecell_heatmap(umi, rod_gene_list, title = "Square root of single cell counts for rod genes")

Apoptosis genes

> make_bulk_heatmap(counts, apoptosis_list, title = "Square root of bulk counts for apoptosis genes")

> make_singlecell_heatmap(umi, apoptosis_list, title = "Square root of single cell counts for apoptosis genes")

Housekeeping genes

> make_bulk_heatmap(counts, housekeeping_list, title = "Square root of bulk counts for housekeeping genes")

> make_singlecell_heatmap(umi, housekeeping_list, title = "Square root of single cell counts for housekeeping genes")

Glycolysis genes

> make_bulk_heatmap(counts, glycolysis_list, title = "Square root of bulk counts for glycolysis genes")

> make_singlecell_heatmap(umi, glycolysis_list, title = "Square root of single cell counts for glycolysis genes")

Calcium genes

> make_bulk_heatmap(counts, calcium_list, title = "Square root of bulk counts for calcium genes")

> make_singlecell_heatmap(umi, calcium_list, title = "Square root of single cell counts for calcium genes")

Nrf2

> make_bulk_heatmap(counts, nrf2_list, title = "Square root of bulk counts for nrf2 genes")

> make_singlecell_heatmap(umi, nrf2_list, title = "Square root of single cell counts for nrf2 genes")

Hif1

> make_bulk_heatmap(counts, hif1_list, title = "Square root of bulk counts for hif1 genes")

> make_singlecell_heatmap(umi, hif1_list, title = "Square root of single cell counts for hif1 genes")

Oxidative genes

> make_bulk_heatmap(counts, oxidative_list, title = "Square root of bulk counts for oxidative genes")

> make_singlecell_heatmap(umi, oxidative_list, title = "Square root of single cell counts for oxidative genes")

> write_tables = function(l, bulk, sc, name) {
+     write.table(subset(bulk, id %in% l$id), file = paste(name, "_bulk.tsv", 
+         sep = ""), quote = FALSE, col.names = TRUE)
+     write.table(subset(sc, rownames(sc) %in% l$id), file = paste(name, "_singlecell.tsv", 
+         sep = ""), quote = FALSE, col.names = TRUE)
+ }
> counts_table = as.data.frame(counts)
> counts_table$id = rownames(counts)
> m = merge(counts_table, combined[, c("id", "symbol")], by = "id", all.x = TRUE)
> write.table(m, file = "bulk_counts.tsv", sep = "\t", quote = FALSE, col.names = TRUE)
> umi_table = as.data.frame(umi)
> umi_table$id = rownames(umi)
> m = merge(umi_table, combined[, c("id", "symbol")], by = "id", all.x = TRUE)
> write.table(m, file = "umi_counts.tsv", sep = "\t", quote = FALSE, col.names = TRUE)